Google form analysis tests

Purpose: determine to what extent the current data can accurately describe correlations and underlying factors affecting the score. Especially concerning the 'before' groups: are there underlying groups that explain the discrepancies in score? Are those groups tied to certain questions?

Table of Contents

Sorted total answers to questions

Cross-samples t-tests

PCAs




In [ ]:
%run "../Functions/1. Google form analysis.ipynb"

PCAs

Purpose: find out which questions carry the most weight in the computation of the score.

Other leads: LDA, ANOVA.

Source for PCA: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html


In [ ]:
binarized = getAllBinarized()

In [ ]:
# Row-wise sum of all binarized columns, written as a product with a vector of ones.
score = binarized.values.dot(np.ones(binarized.shape[1]))

In [ ]:
dimensions = binarized.shape[1]
dimensions

In [ ]:
binarized['class'] = 'default'

In [ ]:
# Feature matrix X: the first `dimensions` columns of the table.
# Label vector y: the column located at position `dimensions`.

X = binarized.iloc[:, :dimensions].values
y = binarized.iloc[:, dimensions].values

Standardizing


In [ ]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then transform that same X.
X_std = StandardScaler().fit(X).transform(X)

1 - Eigendecomposition - Computing Eigenvectors and Eigenvalues

Covariance Matrix


In [ ]:
# Covariance computed by hand: centre the columns, then X_c^T . X_c / (n - 1).
mean_vec = X_std.mean(axis=0)
cov_mat = np.dot((X_std - mean_vec).T, X_std - mean_vec) / (len(X_std) - 1)
print('Covariance matrix \n{}'.format(cov_mat))

In [ ]:
# Same matrix as computed by NumPy, for comparison with the manual computation.
print('NumPy covariance matrix: \n{}'.format(np.cov(np.transpose(X_std))))

Eigendecomposition of the covariance matrix:


In [ ]:
# Covariance matrix from NumPy, followed by its eigendecomposition.
cov_mat = np.cov(np.transpose(X_std))
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n{}'.format(eig_vals))

Correlation Matrix

Eigendecomposition of the standardized data based on the correlation matrix:


In [ ]:
# Correlation matrix of the standardized data, followed by its eigendecomposition.
# These eig_vals / eig_vecs overwrite the ones obtained from the covariance matrix.
cor_mat1 = np.corrcoef(np.transpose(X_std))
eig_vals, eig_vecs = np.linalg.eig(cor_mat1)

#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n{}'.format(eig_vals))

Eigendecomposition of the raw data based on the correlation matrix:

cor_mat2 = np.corrcoef(binarized.T) eig_vals, eig_vecs = np.linalg.eig(cor_mat2)

print('Eigenvectors \n%s' %eig_vecs)

print('\nEigenvalues \n%s' %eig_vals)

Singular Value Decomposition


In [ ]:
# Singular value decomposition of the transposed standardized data.
# np.linalg.svd returns (U, singular values, V^H): `v` is the conjugate transpose of V, not V itself.
u,s,v = np.linalg.svd(X_std.T)

In [ ]:
s

2 - Selecting Principal Components


In [ ]:
# np.linalg.eig returns the eigenvectors as COLUMNS (eig_vecs[:, i] pairs with eig_vals[i]),
# so iterate over the transpose to check that every eigenvector has unit length.
for ev in eig_vecs.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print('Everything ok!')

In [ ]:
# Make a list of (eigenvalue, eigenvector) tuples; eigenvector i is column i of eig_vecs
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

# Sort the tuples from high to low on the eigenvalue ONLY.
# Sorting the raw tuples falls back to comparing the eigenvector arrays whenever
# two eigenvalues are equal, which raises
# "ValueError: The truth value of an array ... is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])

In [ ]:
# Share of the total variance carried by each component (in percent, largest first)
# and the running total of those shares.
tot = sum(eig_vals)
var_exp = [100 * (eig_val / tot) for eig_val in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

with plt.style.context('seaborn-whitegrid'):
    fig, ax = plt.subplots(figsize=(6, 4))

    ax.bar(range(dimensions), var_exp, alpha=0.5, align='center',
           label='individual explained variance')
    ax.step(range(dimensions), cum_var_exp, where='mid',
            label='cumulative explained variance')
    ax.set_ylabel('Explained variance ratio')
    ax.set_xlabel('Principal components')
    ax.legend(loc='best')
    fig.tight_layout()

In [ ]:
var_exp[:5]

In [ ]:
cum_var_exp[:5]

Projection Matrix


In [ ]:
# Projection matrix W: the eigenvectors of the two largest eigenvalues,
# placed side by side as columns.
matrix_w = np.column_stack((eig_pairs[0][1], eig_pairs[1][1]))

print('Matrix W:\n', matrix_w)

3 - Projection Onto the New Feature Space


In [ ]:
gform.columns

In [ ]:
colors = ('blue','red','green','magenta','cyan','purple','yellow','black','white')
len(colors)

In [ ]:
# Project the standardized samples onto the two retained components.
Y = np.dot(X_std, matrix_w)

In [ ]:
# Scatter plot of all samples in the plane of the first two components.
with plt.style.context('seaborn-whitegrid'):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(Y[:, 0], Y[:, 1])
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_title("base PCA")
    plt.show()

import mca

X = binarized.iloc[:,0:dimensions].values y = binarized.iloc[:,dimensions].values

X_std.shape

xstddf = pd.DataFrame(X_std)

Y2 = mca.MCA(xstddf, ncols=dimensions)

with plt.style.context('seaborn-whitegrid'): plt.figure(figsize=(6, 4)) ax = plt.subplot(111) plt.scatter(Y2[:, 0], Y2[:, 1]) plt.xlabel('Principal Component 1') plt.ylabel('Principal Component 2') plt.title("base MCA") plt.show()


In [ ]:
# classNames may be a list or a tuple; it is copied, never modified
def classifyAndPlot(classNames, classes, title = '', rainbow = False):
    """Assign a class to every row of `binarized` and scatter-plot the projection per class.

    Reads the notebook globals `gform`, `binarized`, `localplayerguidkey` and `Y`
    (the rows of `Y` are taken to be in the same order as the rows of `binarized`).
    Side effect: overwrites the 'class' column of `binarized`.

    Parameters
    ----------
    classNames : list or tuple
        One label per entry of `classes`, optionally followed by one extra label
        used for the rows that belong to none of the classes.
    classes : list of pandas.Series
        For each class, the `localplayerguidkey` values of its members. When a
        value belongs to several classes, the last matching class wins.
    title : str, optional
        Title of the plot; no title is set when empty.
    rainbow : bool, optional
        Sample the colors from the rainbow colormap instead of the fixed palette.
        The rainbow colormap is also used when there are more class names than
        palette entries.
    """
    # Private copy: the original appended 'other' to the caller's list, which
    # silently changed the caller's data and failed outright for tuples.
    classNames = list(classNames)
    defaultClassName = ''

    # A default class is only set up when the classes hold fewer entries than gform has rows.
    sampleSize = sum(len(userIds) for userIds in classes)
    if sampleSize < gform.shape[0]:
        if len(classNames) == len(classes) + 1:
            defaultClassName = classNames[-1]
        else:
            defaultClassName = 'other'
            classNames.append(defaultClassName)

    # Hoisted out of the row loop: member arrays of each class and the user-id column.
    classValues = [userIds.values for userIds in classes]
    surveyUserIds = gform[localplayerguidkey]
    isDefaultMissing = defaultClassName not in classNames
    hasUnclassifiedRows = False

    for labelIndex in binarized.index:
        # Row labels look like 'corrections<N>', N being the row position in gform.
        i = int(labelIndex[len('corrections'):])
        userId = surveyUserIds.iloc[i]
        className = defaultClassName
        isUserSet = False
        for classIndex, values in enumerate(classValues):
            if userId in values:
                className = classNames[classIndex]
                isUserSet = True
        if not isUserSet and isDefaultMissing:
            hasUnclassifiedRows = True
        binarized.loc[labelIndex, 'class'] = className

    # Reported once instead of once per offending row.
    if hasUnclassifiedRows:
        print("unexpected error: check the exhaustiveness of the provided classes")

    # Read the labels by column name rather than by position.
    y = binarized['class'].values

    with plt.style.context('seaborn-whitegrid'):
        plt.figure(figsize=(6, 4))
        ax = plt.subplot(111)

        colors = ('blue','red','green','magenta','cyan','purple','yellow','black','white')
        if (rainbow or len(classNames) > len(colors)):
            colors = plt.cm.rainbow(np.linspace(1, 0, len(classNames)))
        colors = colors[:len(classNames)]

        for lab, col in zip(classNames, colors):
            # `color=` and not `c=`: a single RGBA row passed as `c` is ambiguous
            # and gets value-mapped when the class happens to contain 4 points.
            plt.scatter(Y[y==lab, 0],
                        Y[y==lab, 1],
                        label=lab,
                        color=col)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')

        # source https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
        # Put a legend to the right of the current axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        if(len(title) > 0):
            plt.title(title)

        plt.show()

In [ ]:
# Manual test of classifyAndPlot with a title and the rainbow palette.
# This cell used to contain a pasted copy of the whole function body; it now
# calls the function so that the two cannot drift apart.
answered = binarized[binarized['Guess: the bacterium would glow yellow...'] == 1]
# Row labels look like 'corrections<N>', N being the row position in gform.
indices = answered.index.map(lambda label: int(label[len('corrections'):]))
surveys = gform.iloc[indices][localplayerguidkey]

title = 'test title'
rainbow = True

# The extra trailing name ('did not') is the class of every row not in `surveys`.
classNames = ['guessed', 'did not']
classes = [surveys]

classifyAndPlot(classNames, classes, title = title, rainbow = rainbow)

In [ ]:
# Rows whose binarized answer to the 'glow yellow' guess equals 1 ...
answered = binarized.loc[binarized['Guess: the bacterium would glow yellow...'] == 1]
# ... mapped back to gform row positions through their 'corrections<N>' labels ...
indices = answered.index.map(lambda label: int(label[len('corrections'):]))
# ... and reduced to the user ids of those surveys.
surveys = gform[localplayerguidkey].iloc[indices]
classifyAndPlot(['guessed', 'did not'], [surveys])

In [ ]:
# Two classes: the user ids returned by getSurveysOfBiologists, and everyone else.
classifyAndPlot(
    ['biologist', 'other'],
    [getSurveysOfBiologists(gform, True)[localplayerguidkey]],
    title = 'biologists and non-biologists'
)

In [ ]:
# Two classes: the user ids returned by getSurveysOfGamers, and everyone else.
classifyAndPlot(
    ['gamer', 'other'],
    [getSurveysOfGamers(gform, True)[localplayerguidkey]],
    title = 'gamers and non-gamers'
)

In [ ]:
# One class per distinct answer (most frequent first), plus a trailing 'other'
# name for the rows matching none of the answers.
classNames = list(gform['Are you interested in biology?'].value_counts().index)
classes = [
    gform[gform['Are you interested in biology?'] == answer][localplayerguidkey]
    for answer in classNames
]
classNames.append('other')
classifyAndPlot(classNames, classes, rainbow = True, title = 'interest in biology')

TODO: find a simple way to plot scores


In [ ]:
#np.plot(score)

In [ ]:
classNames = []
classes = []
# `score` has one entry per row of `binarized`, and those rows are labelled
# 'corrections<N>' with N the row position in gform (the convention used by
# classifyAndPlot). Translate through the labels and index gform by POSITION,
# instead of using positions in `score` as gform.loc labels.
surveyPositions = np.array([int(label[len('corrections'):]) for label in binarized.index])
for thisScore in np.unique(score):
    classNames.append(thisScore)
    index = surveyPositions[score == thisScore]
    classes.append(gform.iloc[index][localplayerguidkey])
classifyAndPlot(classNames, classes, rainbow = True, title = 'score')

In [ ]:
# One class per distinct answer to the age question, in sorted order.
question = 'How old are you?'
classNames = list(np.sort(gform[question].unique()))
classes = [gform[gform[question] == answer][localplayerguidkey] for answer in classNames]
classifyAndPlot(classNames, classes, rainbow = True, title = 'age')

In [ ]:
gform.columns[:5]

In [ ]:
# One plot per question, classes being the distinct answers to that question.
# Column positions left out on purpose:
#   0  Timestamp
#   3  Age
#   40 Remarks
#   41 ID

from itertools import chain
questionRange = chain(range(1,3), range(4,40), range(42,44))
for questionIndex in questionRange:
    question = gform.columns[questionIndex]
    classNames = list(gform[question].value_counts().index)
    classes = [gform[gform[question] == answer][localplayerguidkey] for answer in classNames]
    classifyAndPlot(classNames, classes, title = question, rainbow = False)

In [ ]:
eig_vals

In [ ]:
# First principal component: the eigenvector paired with the largest eigenvalue.
# (eig_vecs[0] is the first ROW of the matrix, i.e. the first coordinate of every
# eigenvector, because np.linalg.eig stores eigenvectors as columns.)
eig_pairs[0][1]

In [ ]:
# Question with the largest absolute weight in the first principal component.
# eig_pairs[0][1] is the eigenvector of the largest eigenvalue; eig_vecs[0] would
# be a row of the eigenvector matrix, not an eigenvector.
maxComponentIndex = np.argmax(abs(eig_pairs[0][1]))
binarized.columns[maxComponentIndex]

In [ ]:
# Squared norm of the first principal component (expected to be 1) together with
# the component itself. Both are returned as one tuple so that both are displayed;
# only the last expression of a cell is shown.
sum(eig_pairs[0][1]*eig_pairs[0][1]), eig_pairs[0][1]

In [ ]:
# Questions ordered by decreasing absolute weight in the FIRST principal component.
# - eig_pairs[0][1] is the eigenvector of the largest eigenvalue (eig_vecs[0] is a
#   row of the eigenvector matrix, not an eigenvector).
# - argsort yields every column exactly once; looking each weight up again with
#   np.where(...)[0][0] returned the same column twice whenever two weights tied.
#   'mergesort' is stable, so tied weights keep their original column order.
absoluteWeights0 = np.abs(eig_pairs[0][1])
sortedIndices = np.argsort(-absoluteWeights0, kind='mergesort')
descendingWeights = absoluteWeights0[sortedIndices]
sortedQuestions0 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions0

In [ ]:


In [ ]:
# `cm` was used here before the cell that imports it, which fails on a fresh kernel.
from matplotlib import cm

def accessFirst(a):
    """Return the first element of `a` (below: the first channel of an RGBA row)."""
    return a[0]

sortedQuestionsLastIndex = 10
# sortedQuestionsLastIndex + 1 evenly spaced sample points in [0, 1)
array1 = np.arange(sortedQuestionsLastIndex+1.)/(sortedQuestionsLastIndex + 1.)

# For each qualitative colormap, count the distinct first-channel values among the
# sampled colors, to compare with the number of requested colors (first entry).
# NOTE(review): the first channel alone is used as a proxy for "distinct color".
qualitativeColormaps = (cm.Accent, cm.Dark2, cm.Paired, cm.Pastel1,
                        cm.Pastel2, cm.Set1, cm.Set2, cm.Set3)
(sortedQuestionsLastIndex+1,) + tuple(
    len(np.unique(np.apply_along_axis(accessFirst, 1, colormap(array1))))
    for colormap in qualitativeColormaps
)

In [ ]:
from matplotlib import cm

def displayQuestionsContributions(
                                  sortedQuestions,
                                  title = "Contributions of questions to component",
                                  sortedQuestionsLastIndex = 10
                                 ):
    """Draw a pie chart of the largest question weights of one component.

    Parameters
    ----------
    sortedQuestions : pandas.DataFrame
        Question labels as values, indexed by their weights; expected to be
        sorted by decreasing weight already (no sorting is done here).
    title : str, optional
        Title of the chart.
    sortedQuestionsLastIndex : int, optional
        Number of leading questions that get their own slice. The remaining
        weights are summed into a single 'others' slice, which is left out
        when there is no remaining question.
    """
    colors=cm.Set3(np.arange(sortedQuestionsLastIndex+1.)/(sortedQuestionsLastIndex + 1.))

    sortedQuestionsLabelsArray = sortedQuestions.values.flatten()[:sortedQuestionsLastIndex]
    sortedQuestionsValuesArray = np.asarray(sortedQuestions.index[:sortedQuestionsLastIndex])

    # Only add the 'others' slice when some questions are actually left over.
    if len(sortedQuestions.index) > sortedQuestionsLastIndex:
        sortedQuestionsLabelsArray = np.append(sortedQuestionsLabelsArray, 'others')
        sortedQuestionsValuesArray = np.append(sortedQuestionsValuesArray,
                                               sum(sortedQuestions.index[sortedQuestionsLastIndex:]))

    # cf https://matplotlib.org/users/customizing.html
    # The settings are applied BEFORE drawing and only inside this block. Setting
    # plt.rcParams after the pie was drawn had no effect on the first chart and
    # then leaked into every later plot of the notebook.
    with plt.rc_context({'patch.linewidth': 0, 'text.color': '#2b2b2b'}):
        fig1, ax1 = plt.subplots()

        ax1.pie(sortedQuestionsValuesArray, labels=sortedQuestionsLabelsArray, autopct='%1.1f%%', startangle=100, colors = colors)
        ax1.axis('equal')

        plt.title(title)
        plt.tight_layout()
        plt.show()

In [ ]:
displayQuestionsContributions(sortedQuestions0, sortedQuestionsLastIndex = 10, title = 'Contributions of questions to component 1')

In [ ]:
sum(sortedQuestions0.index**2)

In [ ]:
# Questions ordered by decreasing absolute weight in the SECOND principal component.
# - eig_pairs[1][1] is the eigenvector of the second largest eigenvalue (eig_vecs[1]
#   is a row of the eigenvector matrix, not an eigenvector).
# - argsort yields every column exactly once; looking each weight up again with
#   np.where(...)[0][0] returned the same column twice whenever two weights tied.
#   'mergesort' is stable, so tied weights keep their original column order.
absoluteWeights1 = np.abs(eig_pairs[1][1])
sortedIndices = np.argsort(-absoluteWeights1, kind='mergesort')
descendingWeights = absoluteWeights1[sortedIndices]
sortedQuestions1 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions1

In [ ]:
displayQuestionsContributions(sortedQuestions1, sortedQuestionsLastIndex = 10, title = 'Contributions of questions to component 2')

In [ ]:
sum(sortedQuestions1.index**2)